The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
Object recognition
The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
Apply a dimensionality reduction technique – PCA – and train a model using the principal components instead of training the model on just the raw data.
# Libraries needed for scientific computation, data analysis and manipulation
import numpy as np
import pandas as pd
# Libraries needed for statistical data visualization
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# Initialize dataframe with Vehicle dataset
main_df = pd.read_csv("vehicle.csv")
main_df.head()
# Report the dataset dimensions and column dtypes/non-null counts.
n_rows, n_cols = main_df.shape
print(f'No. of rows: {n_rows}')
print(f'No. of columns: {n_cols}')
main_df.info()
Observations
# Per-column counts of null (True) vs. non-null (False) entries.
# FIX: the top-level `pd.value_counts` is deprecated (pandas 2.1+);
# call the Series method via a lambda instead — identical output.
main_df.isnull().apply(lambda col: col.value_counts())
# function to print and return column names that contain null values
def get_columns_with_null_values(df):
    """Print and return the names of columns in *df* that contain nulls."""
    print('Columns with null values:')
    print('-------------------------')
    columns_with_null_values = []
    counter = 0
    for column in df.columns:
        n_missing = df[column].isnull().sum()
        if n_missing > 0:
            columns_with_null_values.append(column)
            counter += 1
            print(f'{counter}. {column} - {n_missing}')
    return columns_with_null_values
# function to print and return column names that contain zero(0) values
def get_columns_with_zero_values(df):
    """Print and return the numeric columns of *df* that contain zero values.

    Non-numeric ('object') columns are skipped — a zero count is only
    meaningful for numeric features.
    """
    print('Columns with zero values:')
    print('-------------------------')
    columns_with_zero_values = []
    ctr = 0
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != 'object':
            # BUG FIX: the original filtered on the global `main_df`
            # instead of the `df` parameter, so the function silently
            # worked only for that one specific dataframe.
            zero_count = int((df[col] == 0).sum())
            if zero_count > 0:
                columns_with_zero_values.append(col)
                ctr = ctr + 1
                print(f'{ctr}. {col} - {zero_count}')
    return columns_with_zero_values
# function to impute null values with median value
def impute_null_values(df, columns):
    """Return a copy of *df* where nulls in *columns* are replaced by each
    column's median; the input frame is left untouched."""
    result = df.copy()
    for column in columns:
        result[column] = result[column].fillna(result[column].median())
    return result
# Identify problematic columns, then impute missing values with the median.
columns_with_null_values = get_columns_with_null_values(main_df)
columns_with_null_values
columns_with_zero_values = get_columns_with_zero_values(main_df)
columns_with_zero_values
data_df = impute_null_values(main_df, columns_with_null_values)
# Null-value heatmaps: before imputation (main_df) vs. after (data_df).
for frame in (main_df, data_df):
    plt.figure(figsize=(10, 4))
    sns.heatmap(frame.isnull(), yticklabels=False, cbar=False, cmap='viridis')
Observations
# Post-imputation sanity checks: dtypes, summary statistics, column names.
data_df.info()
data_df.describe().T
data_df.columns
# separate independent and dependent variables
# All 18 predictor columns, in dataset column order.
independent_features = [
    'compactness', 'circularity', 'distance_circularity',
    'radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio',
    'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity',
    'max.length_rectangularity', 'scaled_variance', 'scaled_variance.1',
    'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1', 'skewness_about',
    'skewness_about.1', 'skewness_about.2', 'hollows_ratio'
]
# Two halves of the feature list, used for side-by-side boxplot panels.
independent_feature_set_1 = independent_features[:9]
independent_feature_set_2 = independent_features[9:]
# Target column: the vehicle class label.
dependent_feature = 'class'
# function to plot the distribution of numeric columns
def plot_distribution(df, columns):
    """Draw a 20-bin histogram with a KDE overlay for each listed column."""
    for column in columns:
        sns.displot(data=df, kind='hist', x=column, bins=20, kde=True)
# function to plot the boxplot distribution of numeric columns
def plot_boxplot(df, x_column, y_columns):
    """Draw boxplots of each y-column grouped by *x_column* on a 3x3 grid."""
    plt.figure(figsize=(25, 25))
    for position, column in enumerate(y_columns, start=1):
        plt.subplot(3, 3, position)
        sns.boxplot(x=x_column, y=column, data=df)
# function to plot correlation using a heatmap
def plot_correlation(df):
    """Render an annotated heatmap of *df*'s pairwise correlation matrix."""
    plt.figure(figsize=(15, 15))
    sns.heatmap(df.corr(), annot=True, fmt='.2f')
# function to plot distribution and box plots for a given set of columns
def plot_dist_and_box(df, columns):
    """For each column, draw a histogram (with KDE) and a boxplot side by side."""
    for column in columns:
        fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2)
        fig.set_size_inches(15, 3)
        sns.histplot(x=df[column], ax=hist_ax, kde=True, bins=20)
        hist_ax.set_title(f'Distribution: {column}')
        sns.boxplot(x=df[column], ax=box_ax)
        box_ax.set_title(f'Box Plot: {column}')
# function to return highly correlated columns
def get_corr_features(df, threshold):
    """Print and return the set of columns whose pairwise correlation
    magnitude is at or above *threshold*.

    Only the lower triangle (j < i) of the correlation matrix is scanned,
    so each pair is reported once and self-correlation is ignored.
    """
    corr_columns = set()
    corr_matrix = df.corr()
    ctr = 0
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            corr_value = corr_matrix.iloc[i, j]
            # BUG FIX: compare |r| so strongly *negative* correlations
            # are also flagged; the original tested the raw value and
            # silently missed inversely related feature pairs.
            if abs(corr_value) >= threshold:
                col_name_i = corr_matrix.columns[i]
                col_name_j = corr_matrix.columns[j]
                corr_columns.add(col_name_i)
                ctr += 1
                print(f'{ctr}. \'{col_name_i}\' and \'{col_name_j}\' are highly correlated (Correlation value = {corr_value})')
    return corr_columns
# function to identify and treat outlier columns
def treat_outliers(df, columns):
    """Return a copy of *df* where, for each listed column, values outside
    the 1.5*IQR whiskers are replaced by the column's median."""
    result = df.copy()
    for column in columns:
        q1 = result[column].quantile(0.25)
        q3 = result[column].quantile(0.75)
        spread = 1.5 * (q3 - q1)
        low, high = q1 - spread, q3 + spread
        outlier_mask = (result[column] < low) | (result[column] > high)
        # Median is computed before any replacement happens.
        result.loc[outlier_mask, column] = result[column].median()
    return result
# Visualize distribution + boxplot of every feature to spot skew and outliers.
plot_dist_and_box(data_df, independent_features)
Observations
# Class-wise boxplot alternative (kept for reference, not executed here).
# plot_boxplot(data_df, dependent_feature, independent_feature_set_1)
# plot_boxplot(data_df, dependent_feature, independent_feature_set_2)
# Features whose plots above showed points beyond the 1.5*IQR whiskers.
columns_with_outliers = [
'radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scaled_variance',
'scaled_variance.1', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1'
]
# Replace the outliers with the median and re-plot to confirm the treatment.
treated_data_df = treat_outliers(data_df, columns_with_outliers)
plot_dist_and_box(treated_data_df, independent_features)
# Class balance of the target: counts plus histogram and count plot.
treated_data_df[dependent_feature].value_counts()
plot_distribution(treated_data_df, [dependent_feature])
sns.countplot(data=treated_data_df, x=dependent_feature)
Observations
# Split the treated data into predictors (X) and target (y).
X = treated_data_df[independent_features]
y = treated_data_df[dependent_feature]
X.shape
y.shape
X.head()
y.head()
# Inspect pairwise correlations among the predictors.
plot_correlation(X)
# get all columns that are correlated greater than 90%
highly_corr_features = get_corr_features(X, 0.90)
highly_corr_features
# Pairwise scatter plots with KDE diagonals — visual check for redundancy.
sns.pairplot(X, diag_kind='kde')
Observations
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# function to scale the dataset
def get_scaled_df(df):
    """Return *df* standardized column-wise (zero mean, unit variance),
    preserving the original column names."""
    transformed = StandardScaler().fit_transform(df)
    return pd.DataFrame(transformed, columns=list(df.columns))
# Standardize the predictors (SVM and PCA are both scale-sensitive).
scaled_X = get_scaled_df(X)
scaled_X.head()
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
# 70/30 train-test split on the scaled features; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.3, random_state=10)
X_train.shape
y_train.shape
X_test.shape
y_test.shape
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def init_svc_classifier():
    """Return a fresh SVC classifier with gamma='auto' (baseline settings)."""
    classifier = SVC(gamma='auto')
    return classifier
def train_and_predict(model_name, model, X_train, y_train, X_test, y_test):
    """Fit *model* on the train split, print train/test accuracy (percent),
    and return the test predictions together with both scores."""
    print(f'MODEL: {model_name}\n')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    train_score = round(model.score(X_train, y_train), 4) * 100
    print(f'Train Accuracy Score: {train_score}')
    test_score = round(model.score(X_test, y_test), 4) * 100
    print(f'Test Accuracy Score: {test_score}')
    return {
        'y_pred': predictions,
        'model_score_train': train_score,
        'model_score_test': test_score
    }
def get_metrics(model_name, y_test, y_pred):
    """Print the classification report and confusion matrix, draw the matrix
    as a heatmap, and return micro-averaged metrics in percent.

    Note: *model_name* is accepted for interface symmetry but not used.
    """
    def _pct(value):
        # Round to 4 decimals, then express as a percentage.
        return round(value, 4) * 100
    print('---------------------')
    print('Classification Report')
    print('---------------------')
    print(classification_report(y_test, y_pred))
    print('----------------')
    print('Confusion Matrix')
    print('----------------')
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    sns.heatmap(cm, annot=True, fmt='g', cbar=False, cmap='YlOrRd')
    return {
        'accuracy': _pct(accuracy_score(y_test, y_pred)),
        'f1': _pct(f1_score(y_test, y_pred, average='micro')),
        'precision': _pct(precision_score(y_test, y_pred, average='micro')),
        'recall': _pct(recall_score(y_test, y_pred, average='micro'))
    }
# Train the baseline SVM on all 18 scaled features and collect its metrics.
model = init_svc_classifier()
model_dict_svm = train_and_predict('SVM', model, X_train, y_train, X_test, y_test)
score_dict_svm = get_metrics('SVM', y_test, model_dict_svm['y_pred'])
model_dict_svm
score_dict_svm
from sklearn.model_selection import KFold, cross_val_score
def cross_validate(model, X, y):
    """Run 10-fold cross-validation (shuffled, seed 10) and print/return
    the accuracy mean, std. dev., min and max — all in percent."""
    kfold = KFold(n_splits=10, random_state=10, shuffle=True)
    scores = cross_val_score(model, X, y, cv=kfold)
    raw_stats = {
        'mean': scores.mean(),
        'std': scores.std(),
        'min': scores.min(),
        'max': scores.max()
    }
    cv_score_dict = {name: round(value, 4) * 100 for name, value in raw_stats.items()}
    print('-------------------------------')
    print('Cross Validation Accuracy Score')
    print('-------------------------------')
    print(f"1. Mean: {cv_score_dict['mean']}")
    print(f"2. Std. Dev.: {cv_score_dict['std']}")
    print(f"3. Min: {cv_score_dict['min']}")
    print(f"4. Max: {cv_score_dict['max']}")
    return cv_score_dict
# 10-fold cross-validation of the baseline SVM on the full scaled feature set.
model = init_svc_classifier()
cv_score_svm = cross_validate(model, scaled_X, y)
cv_score_svm
scaled_X.head()
scaled_X.shape
from sklearn.decomposition import PCA
# Fit PCA on all components to inspect the explained-variance spectrum.
pca = PCA()
pca.fit(scaled_X)
# eigen values
pca.explained_variance_
# eigen vectors
pca.components_
# variance ratio
pca.explained_variance_ratio_
# plot eigen values
plt.bar(list(range(1,19)), pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation Explained')
plt.xlabel('Eigen Value')
# Cumulative explained variance — used to choose how many components to keep.
cumulative_sum = np.cumsum(pca.explained_variance_ratio_)
cumulative_sum
# plot using step function
plt.step(list(range(1,19)), cumulative_sum, where = 'mid')
plt.ylabel('Cumulative Variation Explained')
plt.xlabel('Eigen Value')
Observations
# Re-fit PCA keeping 8 components (chosen from the cumulative-variance plot).
pca = PCA(n_components=8)
pca_result = pca.fit_transform(scaled_X)
pca_X = pd.DataFrame(pca_result)
pca_X.shape
y.shape
# Principal components should be uncorrelated — verify visually.
plot_correlation(pca_X)
sns.pairplot(pca_X, diag_kind='kde')
Observations
# Same 70/30 split (same seed) on the PCA-transformed features for a fair comparison.
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(pca_X, y, test_size=0.3, random_state=10)
pca_X_train.shape
pca_y_train.shape
pca_X_test.shape
pca_y_test.shape
# Train and evaluate the SVM on principal components.
model = init_svc_classifier()
model_dict_svm_pca = train_and_predict('SVM with PCA', model, pca_X_train, pca_y_train, pca_X_test, pca_y_test)
score_dict_svm_pca = get_metrics('SVM with PCA', pca_y_test, model_dict_svm_pca['y_pred'])
model_dict_svm_pca
score_dict_svm_pca
# Cross-validate the PCA pipeline as well.
model = init_svc_classifier()
cv_score_svm_pca = cross_validate(model, pca_X, y)
cv_score_svm_pca
# Tabulate train/test and holdout metric scores for both models.
accuracy_data = []
for label, fit_dict, metric_dict in [
    ('SVM - All', model_dict_svm, score_dict_svm),
    ('SVM - PCA', model_dict_svm_pca, score_dict_svm_pca)
]:
    accuracy_data.append([
        label, fit_dict['model_score_train'], fit_dict['model_score_test'],
        metric_dict['accuracy'], metric_dict['f1'], metric_dict['precision'], metric_dict['recall']
    ])
accuracy_score_df = pd.DataFrame(
    accuracy_data, columns=['Model', 'Train Score', 'Test Score', 'Accuracy', 'F1', 'Precision', 'Recall']
)
# Tabulate the cross-validation summaries for both models.
cv_data = [
    ['SVM - All'] + [cv_score_svm[key] for key in ('mean', 'std', 'min', 'max')],
    ['SVM - PCA'] + [cv_score_svm_pca[key] for key in ('mean', 'std', 'min', 'max')]
]
cv_score_df = pd.DataFrame(
    cv_data, columns=['Model', 'Mean', 'Std. Dev.', 'Min', 'Max']
)
accuracy_score_df
cv_score_df
Observations & Conclusion